{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T10:11:53.573638Z",
     "start_time": "2020-04-01T10:11:52.954068Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score\n",
    "from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold\n",
    "from tqdm import tqdm\n",
    "import xgboost as xgb\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import math\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import warnings\n",
    "import gc\n",
    "from datetime import datetime\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T10:11:53.577557Z",
     "start_time": "2020-04-01T10:11:53.575435Z"
    }
   },
   "outputs": [],
   "source": [
    "seed = 2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T10:11:54.013477Z",
     "start_time": "2020-04-01T10:11:53.578780Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature = pd.read_pickle('feature.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T10:11:54.555479Z",
     "start_time": "2020-04-01T10:11:54.014809Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "for f in tqdm(df_feature.select_dtypes('object')):\n",
    "    lbl = LabelEncoder()\n",
    "    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T10:11:55.031052Z",
     "start_time": "2020-04-01T10:11:54.556693Z"
    }
   },
   "outputs": [],
   "source": [
    "df_test = df_feature[df_feature['price'].isnull()].copy()\n",
    "df_train = df_feature[df_feature['price'].notnull()].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T12:19:53.834236Z",
     "start_time": "2020-04-01T10:11:55.032861Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "ycol = 'price'\n",
    "feature_names = list(\n",
    "    filter(lambda x: x not in [ycol, 'SaleID', 'regDate', 'creatDate', 'creatDate_year', 'creatDate_month'], df_train.columns))\n",
    "\n",
    "model = xgb.XGBRegressor(num_leaves=64,\n",
    "                         max_depth=8,\n",
    "                         learning_rate=0.08,\n",
    "                         n_estimators=10000000,\n",
    "                         subsample=0.75,\n",
    "                         feature_fraction=0.75,\n",
    "                         reg_alpha=0.7,\n",
    "                         reg_lambda=1.2,\n",
    "                         random_state=seed,\n",
    "                         metric=None,\n",
    "                         tree_method='gpu_hist'\n",
    "                         )\n",
    "\n",
    "oof = []\n",
    "prediction = df_test[['SaleID']]\n",
    "prediction['price'] = 0\n",
    "df_importance_list = []\n",
    "\n",
    "kfold = KFold(n_splits=5, shuffle=False, random_state=seed)\n",
    "for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):\n",
    "    X_train = df_train.iloc[trn_idx][feature_names]\n",
    "    Y_train = df_train.iloc[trn_idx][ycol]\n",
    "\n",
    "    X_val = df_train.iloc[val_idx][feature_names]\n",
    "    Y_val = df_train.iloc[val_idx][ycol]\n",
    "\n",
    "    print('\\nFold_{} Training ================================\\n'.format(fold_id+1))\n",
    "\n",
    "    lgb_model = model.fit(X_train,\n",
    "                          Y_train,\n",
    "                          eval_set=[(X_train, Y_train), (X_val, Y_val)],\n",
    "                          verbose=1000,\n",
    "                          eval_metric='mae',\n",
    "                          early_stopping_rounds=500)\n",
    "\n",
    "    pred_val = lgb_model.predict(\n",
    "        X_val)\n",
    "    df_oof = df_train.iloc[val_idx][['SaleID', ycol]].copy()\n",
    "    df_oof['pred'] = pred_val\n",
    "    oof.append(df_oof)\n",
    "\n",
    "    pred_test = lgb_model.predict(\n",
    "        df_test[feature_names])\n",
    "    prediction['price'] += pred_test / 5\n",
    "\n",
    "    df_importance = pd.DataFrame({\n",
    "        'column': feature_names,\n",
    "        'importance': lgb_model.feature_importances_,\n",
    "    })\n",
    "    df_importance_list.append(df_importance)\n",
    "\n",
    "    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T12:19:53.866498Z",
     "start_time": "2020-04-01T12:19:53.835713Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_importance = pd.concat(df_importance_list)\n",
    "df_importance = df_importance.groupby(['column'])['importance'].agg(\n",
    "    'mean').sort_values(ascending=False).reset_index()\n",
    "df_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T12:19:54.344228Z",
     "start_time": "2020-04-01T12:19:53.867796Z"
    }
   },
   "outputs": [],
   "source": [
    "df_oof = pd.concat(oof)\n",
    "df_oof[ycol] = np.expm1(df_oof[ycol])\n",
    "df_oof['pred'] = np.expm1(df_oof['pred'])\n",
    "mae = mean_absolute_error(df_oof[ycol], df_oof['pred'])\n",
    "print('mae:', mae)\n",
    "df_oof.to_csv('xgb_oof.csv'.format(mae), index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T12:19:54.618364Z",
     "start_time": "2020-04-01T12:19:54.345642Z"
    }
   },
   "outputs": [],
   "source": [
    "prediction['price'] = np.expm1(prediction['price'])\n",
    "sub = prediction.copy(deep=True)\n",
    "sub.to_csv('sub/xgb_{}.csv'.format(mae), index=False, encoding='utf-8')\n",
    "sub.to_csv('xgb.csv'.format(mae), index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-04-01T12:19:54.625279Z",
     "start_time": "2020-04-01T12:19:54.619753Z"
    }
   },
   "outputs": [],
   "source": [
    "sub.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:dm] *",
   "language": "python",
   "name": "conda-env-dm-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
